# Purpose: Produce the levels-matching movers plots for the language outcome.
#           This is measured only ONCE at the end of the sample.

source("constants.R")

#####################################################
## 1. Regressions matching only by cohort X region ##
#####################################################

##### Match only by cohort X region ######
# See the README for instructions on how to access this data
dat_in.split <- read_csv("input/movers_language_match1.csv")

dat.split <- dat_in.split %>%
  filter(!is.na(pct_produ_de)) %>%
  mutate(share_of_time_in_max_nuts3 = quarters_in_max_nuts3/quarters_in_de_unfixed)

### Create the regression columns
col1_dat <- dat.split %>%
  group_by(first_year_in_de) %>%
  mutate(n = n()) %>%  filter(n > 1) %>% ungroup()
col1 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de)  | (wtd_avg_avg2~wtd_avg_avg1) | 0, dat=col1_dat)
ymean1 <- mean(col1_dat$produ_any_de)


col2_dat <- dat.split %>%
  filter(share_of_time_in_max_nuts3 < 0.75) %>% group_by(first_year_in_de) %>%
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
col2 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de) | (wtd_avg_avg2~wtd_avg_avg1) | 0, dat=col2_dat)
ymean2 <- mean(col2_dat$produ_any_de)


col3_dat <- dat.split %>%
  filter(share_of_time_in_max_nuts3 < 0.6) %>% group_by(first_year_in_de) %>%
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
col3 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de) | (wtd_avg_avg2~wtd_avg_avg1) | 0, col3_dat)
ymean3 <- mean(col3_dat$produ_any_de)


col4_dat <- dat.split %>%
  group_by(first_year_in_de, curr_nuts3) %>%
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
col4 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de):as.factor(curr_nuts3) | (wtd_avg_avg2~wtd_avg_avg1) | 0, dat=col4_dat)
ymean4 <- mean(col4_dat$produ_any_de)


col5_dat <- dat.split %>%
  group_by(first_year_in_de, curr_nuts3, first_nuts3_in_de) %>%
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
col5 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de):as.factor(curr_nuts3):as.factor(first_nuts3_in_de) | (wtd_avg_avg2~wtd_avg_avg1) | 0, dat=col5_dat)
ymean5 <- mean(col5_dat$produ_any_de)


stargazer::stargazer(col1, col2, col3, col4, col5,
                     keep.stat = c("n","rsq"),
                     add.lines=list(
                       c("FEs", "Cohort", "Cohort", "Cohort", "Cohort*Curr NUTS3", "Cohort*Curr NUTS3*First NUTS3"),
                       c("Sample", "", "Less 75% in Max NUTS3", "Less 60% in Max NUTS3", "", ""),
                       c("ymean", round(ymean1,3), round(ymean2,3), round(ymean3,3), round(ymean4,3), round(ymean5,3))),
                     type="html",
                     out="output/movers_language_regressions.doc")

binscatter(dat.split, c("wtd_avg_avg1","wtd_avg_avg2"), "produ_any_de", fes="first_year_in_de") +
  labs(x="Predicted Prob. of Using German", y="Actual Share Using German")
ggsave("output/sy_movers_language_binscatter_COHORT_FE.png", last_plot(), width=5, height=5)


################################################################
## 2. Regressions matching by cohort X region X age X gender  ##
################################################################

##### Match only by cohort X region X age X gender ######
# See the README for instructions on how to access this data
dat_in.split <- read_csv("input/movers_language_match2.csv")

dat.split <- dat_in.split %>%
  filter(!is.na(pct_produ_de)) %>%
  mutate(share_of_time_in_max_nuts3 = quarters_in_max_nuts3/quarters_in_de_unfixed)

### Create the regression columns
col1_dat <- dat.split %>%
  group_by(first_year_in_de) %>%
  mutate(n = n()) %>%  filter(n > 1) %>% ungroup()
col1 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de)  | (wtd_avg_avg2~wtd_avg_avg1) | 0, dat=col1_dat)
ymean1 <- mean(col1_dat$produ_any_de)


col2_dat <- dat.split %>%
  filter(share_of_time_in_max_nuts3 < 0.75) %>% group_by(first_year_in_de) %>%
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
col2 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de) | (wtd_avg_avg2~wtd_avg_avg1) | 0, dat=col2_dat)
ymean2 <- mean(col2_dat$produ_any_de)


col3_dat <- dat.split %>%
  filter(share_of_time_in_max_nuts3 < 0.6) %>% group_by(first_year_in_de) %>%
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
col3 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de) | (wtd_avg_avg2~wtd_avg_avg1) | 0, col3_dat)
ymean3 <- mean(col3_dat$produ_any_de)


col4_dat <- dat.split %>%
  group_by(first_year_in_de, curr_nuts3) %>%
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
col4 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de):as.factor(curr_nuts3) | (wtd_avg_avg2~wtd_avg_avg1) | 0, dat=col4_dat)
ymean4 <- mean(col4_dat$produ_any_de)


col5_dat <- dat.split %>%
  group_by(first_year_in_de, curr_nuts3, first_nuts3_in_de) %>%
  mutate(n = n()) %>% filter(n > 1) %>% ungroup()
col5 <- felm(produ_any_de ~ 1 | as.factor(first_year_in_de):as.factor(curr_nuts3):as.factor(first_nuts3_in_de) | (wtd_avg_avg2~wtd_avg_avg1) | 0, dat=col5_dat)
ymean5 <- mean(col5_dat$produ_any_de)


stargazer::stargazer(col1, col2, col3, col4, col5,
                     keep.stat = c("n","rsq"),
                     add.lines=list(
                       c("FEs", "Cohort", "Cohort", "Cohort", "Cohort*Curr NUTS3", "Cohort*Curr NUTS3*First NUTS3"),
                       c("Sample", "", "Less 75% in Max NUTS3", "Less 60% in Max NUTS3", "", ""),
                       c("ymean", round(ymean1,3), round(ymean2,3), round(ymean3,3), round(ymean4,3), round(ymean5,3))),
                     type="html",
                     out="output/movers_language_regressions_AGE_GENDER.doc")

binscatter(dat.split, c("wtd_avg_avg1","wtd_avg_avg2"), "produ_any_de", fes="first_year_in_de") +
  labs(x="Predicted Prob. of Using German", y="Actual Share Using German")
ggsave("output/sy_movers_language_binscatter_AGE_GENDER__COHORT_FE.png", last_plot(), width=5, height=5)
